suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(datasauRus))
suppressPackageStartupMessages(library(skimr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(ggthemes))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(leaflet))
suppressPackageStartupMessages(library(leaflet.extras2))Assignment1
Settings
idir <- "/Users/xinranwang/Documents/Course/25Fall/PM566/Data"EDA
(30 points) Given the formulated question from the assignment description, you will now conduct EDA Checklist items 1-5. First, download 2002 and 2022 data for all sites in California from the EPA Air Quality Data website, then read the data into R. For each of the two datasets, check the dimensions, headers, footers, variable names and variable types. Check the distribution of the key variable we are analyzing (PM). Write up a summary of all of your findings.
Note. Since
skimcollapse all number intonumeric, I also attached a table to distinguish between numerical typesSummary. This data frame includes measurements at the state level (California), county level (51 counties), and city level (199 monitoring sites). PM2.5 was recorded using 21 different methods (see Method.Description for details) and reported in units of µg/m3 LC.
read_df <- function(idir, filename, year){ df <- read.csv(paste0(idir, "/", filename)) %>% mutate(Year = year) return(df) }check_df <- function(df){ print(paste0("Dimension: ", paste(dim(df), collapse = " x "), "; # Rows: ", nrow(df), "; # Columns: ", ncol(df))) df_type <- data.frame(Type = sapply(df, function(x) paste(class(x), collapse = ", "))) kable(df_type, caption = "Variable types in df") }2002
df_2002 <- read_df(idir, "ad_viz_plotval_data_PM2.5_2002.csv", "2002") check_df(df_2002)[1] "Dimension: 15976 x 23; # Rows: 15976; # Columns: 23"Variable types in df Type Date character Source character Site.ID integer POC integer Daily.Mean.PM2.5.Concentration numeric Units character Daily.AQI.Value integer Local.Site.Name character Daily.Obs.Count integer Percent.Complete numeric AQS.Parameter.Code integer AQS.Parameter.Description character Method.Code integer Method.Description character CBSA.Code integer CBSA.Name character State.FIPS.Code integer State character County.FIPS.Code integer County character Site.Latitude numeric Site.Longitude numeric Year character skim(df_2002)Data summary Name df_2002 Number of rows 15976 Number of columns 23 _______________________ Column type frequency: character 10 numeric 13 ________________________ Group variables None Variable type: character
skim_variable n_missing complete_rate min max empty n_unique whitespace Date 0 1 10 10 0 365 0 Source 0 1 3 3 0 1 0 Units 0 1 8 8 0 1 0 Local.Site.Name 0 1 0 43 118 103 0 AQS.Parameter.Description 0 1 24 38 0 2 0 Method.Description 0 1 28 62 0 6 0 CBSA.Name 0 1 0 45 929 31 0 State 0 1 10 10 0 1 0 County 0 1 4 15 0 48 0 Year 0 1 4 4 0 1 0 Variable type: numeric
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist Site.ID 0 1.00 60549600.12 289945.67 60010007.00 60290014.00 60590007.00 60731002.00 61131003.00 ▃▇▅▆▃ POC 0 1.00 1.58 1.25 1.00 1.00 1.00 1.00 6.00 ▇▁▁▁▁ Daily.Mean.PM2.5.Concentration 0 1.00 16.12 13.87 0.00 7.00 12.00 20.50 104.30 ▇▂▁▁▁ Daily.AQI.Value 0 1.00 59.28 31.95 0.00 39.00 56.00 72.00 185.00 ▃▇▂▁▁ Daily.Obs.Count 0 1.00 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁ Percent.Complete 0 1.00 100.00 0.00 100.00 100.00 100.00 100.00 100.00 ▁▁▇▁▁ AQS.Parameter.Code 0 1.00 88215.26 181.01 88101.00 88101.00 88101.00 88502.00 88502.00 ▇▁▁▁▃ Method.Code 0 1.00 297.04 282.45 117.00 120.00 120.00 707.00 810.00 ▇▁▁▁▃ CBSA.Code 929 0.94 33269.99 11059.59 12540.00 23420.00 40140.00 41740.00 49700.00 ▃▂▃▇▁ State.FIPS.Code 0 1.00 6.00 0.00 6.00 6.00 6.00 6.00 6.00 ▁▁▇▁▁ County.FIPS.Code 0 1.00 54.78 28.98 1.00 29.00 59.00 73.00 113.00 ▃▇▅▆▃ Site.Latitude 0 1.00 36.00 2.28 32.63 34.07 35.36 37.77 41.71 ▇▃▅▃▁ Site.Longitude 0 1.00 -119.38 1.95 -124.16 -121.37 -119.06 -117.87 -115.48 ▁▇▇▇▅ kable(head(df_2002, 1), caption = "Head of df")Head of df Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year 01/05/2002 AQS 60010007 1 25.1 ug/m3 LC 81 Livermore 1 100 88101 PM2.5 - Local Conditions 120 Andersen RAAS2.5-300 PM2.5 SEQ w/WINS 41860 San Francisco-Oakland-Hayward, CA 6 California 1 Alameda 37.68753 -121.7842 2002 kable(tail(df_2002, 1), caption = "Tail of df")Tail of df Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year 15976 12/31/2002 AQS 61131003 1 6 ug/m3 LC 33 Woodland-Gibson Road 1 100 88101 PM2.5 - Local Conditions 117 R & P Model 2000 PM2.5 Sampler w/WINS 40900 Sacramento–Roseville–Arden-Arcade, CA 6 California 113 Yolo 38.66121 -121.7327 2002 2022
df_2022 <- read_df(idir, "ad_viz_plotval_data_PM2.5_2022.csv", "2022") check_df(df_2022)[1] "Dimension: 59918 x 23; # Rows: 59918; # Columns: 23"Variable types in df Type Date character Source character Site.ID integer POC integer Daily.Mean.PM2.5.Concentration numeric Units character Daily.AQI.Value integer Local.Site.Name character Daily.Obs.Count integer Percent.Complete numeric AQS.Parameter.Code integer AQS.Parameter.Description character Method.Code integer Method.Description character CBSA.Code integer CBSA.Name character State.FIPS.Code integer State character County.FIPS.Code integer County character Site.Latitude numeric Site.Longitude numeric Year character skim(df_2022)Data summary Name df_2022 Number of rows 59918 Number of columns 23 _______________________ Column type frequency: character 10 numeric 13 ________________________ Group variables None Variable type: character
skim_variable n_missing complete_rate min max empty n_unique whitespace Date 0 1 10 10 0 365 0 Source 0 1 3 3 0 1 0 Units 0 1 8 8 0 1 0 Local.Site.Name 0 1 3 42 0 165 0 AQS.Parameter.Description 0 1 24 38 0 2 0 Method.Description 0 1 24 62 0 18 0 CBSA.Name 0 1 0 45 4569 33 0 State 0 1 10 10 0 1 0 County 0 1 4 15 0 50 0 Year 0 1 4 4 0 1 0 Variable type: numeric
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist Site.ID 0 1.00 60564224.91 290630.89 60010007.00 60290019.00 60631006.00 60731026.00 61131003.00 ▅▇▇▇▃ POC 0 1.00 3.77 4.87 1.00 1.00 3.00 3.00 24.00 ▇▁▁▁▁ Daily.Mean.PM2.5.Concentration 0 1.00 8.41 7.64 -6.70 4.10 6.80 10.70 302.50 ▇▁▁▁▁ Daily.AQI.Value 0 1.00 39.22 21.55 0.00 23.00 38.00 54.00 454.00 ▇▁▁▁▁ Daily.Obs.Count 0 1.00 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁ Percent.Complete 0 1.00 100.00 0.00 100.00 100.00 100.00 100.00 100.00 ▁▁▇▁▁ AQS.Parameter.Code 0 1.00 88192.43 168.24 88101.00 88101.00 88101.00 88101.00 88502.00 ▇▁▁▁▂ Method.Code 0 1.00 337.99 253.07 143.00 170.00 170.00 707.00 810.00 ▇▁▁▁▃ CBSA.Code 4569 0.92 34970.64 10229.06 12540.00 31080.00 40140.00 41860.00 49700.00 ▂▂▂▇▂ State.FIPS.Code 0 1.00 6.00 0.00 6.00 6.00 6.00 6.00 6.00 ▁▁▇▁▁ County.FIPS.Code 0 1.00 56.28 29.06 1.00 29.00 63.00 73.00 113.00 ▅▇▇▇▃ Site.Latitude 0 1.00 36.25 2.31 32.58 34.07 36.49 37.96 41.76 ▇▃▇▅▁ Site.Longitude 0 1.00 -119.57 2.04 -124.20 -121.37 -119.59 -117.94 -115.48 ▁▇▆▇▃ kable(head(df_2022, 1), caption = "Head of df")Head of df Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year 01/01/2022 AQS 60010007 3 12.7 ug/m3 LC 58 Livermore 1 100 88101 PM2.5 - Local Conditions 170 Met One BAM-1020 Mass Monitor w/VSCC 41860 San Francisco-Oakland-Hayward, CA 6 California 1 Alameda 37.68753 -121.7842 2022 kable(tail(df_2022, 1), caption = "Tail of df")Tail of df Date Source Site.ID POC Daily.Mean.PM2.5.Concentration Units Daily.AQI.Value Local.Site.Name Daily.Obs.Count Percent.Complete AQS.Parameter.Code AQS.Parameter.Description Method.Code Method.Description CBSA.Code CBSA.Name State.FIPS.Code State County.FIPS.Code County Site.Latitude Site.Longitude Year 59918 12/31/2022 AQS 61131003 1 1 ug/m3 LC 6 Woodland-Gibson Road 1 100 88101 PM2.5 - Local Conditions 145 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC 40900 Sacramento–Roseville–Arden-Arcade, CA 6 California 113 Yolo 38.66121 -121.7327 2022 identical(colnames(df_2002), colnames(df_2022))[1] TRUENote. Here, I combined data frames as they have identical column names.
df <- rbind(df_2002, df_2022) rm(df_2002, df_2022)Distribution of PM 2.5
Summary. There is a striking decrease in daily PM2.5 concentrations from 2002 to 2022. Mean and median PM 2.5 were cut in half. However, there are more outliers in 2022 and max PM 2.5 increased by 3 fold
df <- df %>% rename(PM2.5 = Daily.Mean.PM2.5.Concentration) df$Year <- factor(df$Year, levels = c("2002", "2022")) summary1 <- df %>% group_by(Year) %>% summarise( mean = round(mean(PM2.5, na.rm = T), 2), sd = round(sd(PM2.5, na.rm = TRUE), 2), min = min(PM2.5, na.rm = TRUE), q25 = quantile(PM2.5, 0.25, na.rm = TRUE), median = median(PM2.5, na.rm = TRUE), q75 = quantile(PM2.5, 0.75, na.rm = TRUE), max = max(PM2.5, na.rm = TRUE) ) kable(as.data.frame(summary1), align = "c")Year mean sd min q25 median q75 max 2002 16.12 13.87 0.0 7.0 12.0 20.5 104.3 2022 8.41 7.64 -6.7 4.1 6.8 10.7 302.5 (10 points) Combine the two years of data into one data frame. Use the
Datevariable to create a new column for year, which will serve as an identifier. Change the names of the key variables so that they are easier to refer to in your code.Note. (1) I’ve created new column for year when reading data, (2) I’ve renamed Daily.Mean.PM2.5.Concentration to PM2.5 in previous step (3) I’ve combined data in previous step (before plotting).
(20 points) Create a basic map (or maps) in
leafletshowing the locations of the monitoring sites, using different colors for each year. Summarize the spatial distribution of the sites. How does this distribution change from 2002 to 2022?Map (2002 vs 2022)
Summary. By 2022, more sites had been established, frequently clustering near the 2002 sites (see map above). The number of sites increased 3.75-fold from 2002 to 2022 (rising from 15,978 to 59,918 sites; see histrogram and table in Question 4).
# Map #scales::hue_pal()(2) pal <- leaflet::colorFactor(c("#F8766D", "#00BFC4"), domain = levels(df$Year)) leaflet(df) %>% addMapPane("left", zIndex = 410) %>% addMapPane("right", zIndex = 410) %>% addProviderTiles('OpenStreetMap', options = pathOptions(pane = "left"), layerId = "layer_2002") %>% addProviderTiles('OpenStreetMap', options = pathOptions(pane = "right"), layerId = "layer_2022") %>% addCircles(data = df %>% filter(Year == 2002), lat = ~Site.Latitude, lng = ~Site.Longitude, fillColor = "#F8766D", color = "black", stroke = TRUE, weight = 0.2, opacity = 0.5, fillOpacity = 1, radius = 100, options = pathOptions(pane = "left"), group = "2002") %>% addCircles(data = df %>% filter(Year == 2022), lat = ~Site.Latitude, lng = ~Site.Longitude, fillColor = "#00BFC4", color = "black", stroke = TRUE, weight = 0.2, opacity = 0.5, fillOpacity = 1, radius = 100, options = pathOptions(pane = "right"), group = "2022") %>% addLegend("topright", title = "Location of Monitor Sites", pal = pal, values = ~Year, labels = ~Year ) %>% addLayersControl(overlayGroups = ~c("2002", "2022"), options = layersControlOptions(collapsed = F)) %>% addSidebyside( layerId = "sidecontrols", leftId = "layer_2002", rightId = "layer_2022")#https://rdrr.io/cran/leaflet.extras2/man/addSidebyside.html(10 points) Check for any data issues such as missing or implausible values of PM in the combined dataset. Calculate the proportion of missing/implausible values for each year and report any temporal patterns you see in these observations.
Missing values. There are no missing values in either year.
Implausible values. In 2022, 215 out of 59,918 observations (0.36%) were implausible (defined as PM2.5 < 0).
Temporal Patterns. The number of implausible observations increased in 2022, although the count remained very small overall).
summary2 <- df %>% group_by(Year) %>% summarise( `# Total` = n(), `# Missing` = sum(is.na(PM2.5)), `# Implausible` = sum(PM2.5 < 0, na.rm = TRUE) ) %>% mutate( `% Missing` = paste0(round((`# Missing` / `# Total`) * 100, 2), "%"), `% Implausible` = paste0(round((`# Implausible` / `# Total`) * 100, 2), "%")) %>% select(`# Total`, `# Missing`, `% Missing`, `# Implausible`, `% Implausible`) kable(as.data.frame(summary2), align = "c")# Total # Missing % Missing # Implausible % Implausible 15976 0 0% 0 0% 59918 0 0% 215 0.36% # Count of PM 2.5 Concentration df$Year <- factor(df$Year, levels = c("2022", "2002")) # 2002 will be drawn last (on top) ggplot(df, aes(x = PM2.5, fill = Year, color = Year)) + geom_histogram(bins = 100, alpha = 0.5, position = "identity") + labs(title = "Histrogram of Daily PM 2.5 Concentration", x = "PM2.5 Concentration", y = "Count") + scale_color_manual(name = "Year", values = c("2002" = "#F8766D", "2022" = "#00BFC4")) + scale_fill_manual(name = "Year", values = c("2002" = "#F8766D", "2022" = "#00BFC4")) + theme_classic()df$Year <- factor(df$Year, levels = c("2002", "2022")) # 2002 will be drawn last (on top)(30 points) Explore the main question of interest at three different levels of spatial resolution. Create exploratory plots (e.g. boxplots, histograms, line plots, violin plots) and summary statistics that best suit each level of data. Be sure to write up explanations of what you observe in these data.
Level 1: State. Examine the primary question for the entire state.
Level 2: County. Examine the primary question for every county in California.
Level 3: City. Restrict the data to sites in Los Angeles county and examine the primary question for every site.
Level 1: State
Summary. The mean of daily PM 2.5 in 2022 (mean: 8.41; sd: 7.64) is lower than than 2002 (mean: 16.12; sd: 13.87), despite more outliers.
summary3 <- df %>% group_by(Year) %>% summarise( n = n(), mean = round(mean(PM2.5, na.rm = T), 2), sd = round(sd(PM2.5, na.rm = T), 2)) %>% mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>% select(Year, `Mean (SD)`) %>% arrange(Year) kable(as.data.frame(summary3), align = "c")Year Mean (SD) 2002 16.12 (13.87) 2022 8.41 (7.64) ggplot(df, aes_string(x = "Year", y = "PM2.5", fill = "Year", color = "Year")) + geom_boxplot(color = "black", size = 0.2) + labs(title = paste0(" Daily PM2.5 Concentration (2002 vs 2022) in California"), x = "Year", y = paste0("Daily PM2.5 Concentration")) + scale_color_manual(name = "Year", values = c("2002" = "#F8766D", "2022" = "#00BFC4")) + theme_minimal() + theme(legend.position = "bottom", legend.box = "horizontal", legend.justification = "center")Warning: `aes_string()` was deprecated in ggplot2 3.0.0. ℹ Please use tidy evaluation idioms with `aes()`. ℹ See also `vignette("ggplot2-in-packages")` for more information.Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.Level 2: County
Summary. The mean of daily PM 2.5 for most counties in California in 2022 are lower than that of 2000, except for Trinity, Siskiyou, Mono, and Del Norte.
summary4 <- df %>% group_by(County, Year) %>% summarise( n = n(), mean = round(mean(PM2.5, na.rm = T), 2), sd = round(sd(PM2.5, na.rm = TRUE), 2)) %>% mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>% select(Year, County, `Mean (SD)`) %>% pivot_wider( names_from = Year, values_from = `Mean (SD)`, names_prefix = "Mean (SD) in ") %>% select(County, `Mean (SD) in 2002`, `Mean (SD) in 2022`)`summarise()` has grouped output by 'County'. You can override using the `.groups` argument.kable(as.data.frame(summary4), align = "c")County Mean (SD) in 2002 Mean (SD) in 2022 Alameda 14.25 (11.43) 8.2 (4.95) Butte 14.76 (11.66) 6.19 (5.79) Calaveras 9.9 (6.5) 6.04 (4.1) Colusa 11.73 (10.01) 7.61 (4.76) Contra Costa 15.09 (14.47) 8.24 (4.93) Del Norte 3.82 (2.56) 4.97 (3.58) El Dorado 4.91 (4.07) 4.07 (9.18) Fresno 19.93 (18.91) 10.19 (8.3) Glenn NA 5.34 (4.98) Humboldt 7.79 (4.59) 6.76 (4.35) Imperial 12.71 (7.7) 9.55 (6.39) Inyo 7.14 (10.49) 5.55 (5.47) Kern 23 (18.72) 9.88 (9.91) Kings 24.7 (19.06) 14.4 (10.78) Lake 6.5 (10.88) 4.26 (3.12) Los Angeles 19.66 (11.88) 10.97 (5.24) Madera NA 10.39 (7.26) Marin 7.31 (4.87) 6.54 (4.17) Mariposa 9.48 (5.97) 9.47 (17.78) Mendocino 8.85 (8.79) 10.14 (5.19) Merced 21.54 (15.32) 10.06 (7.32) Modoc 5 (0) NA Mono 2.68 (2.57) 4.43 (6.09) Monterey 8.97 (4.53) 4.75 (3.21) Nevada 7.36 (3.41) 7.59 (15.86) Orange 17.83 (10.55) 9.24 (4.43) Placer 13.1 (10.24) 7.28 (15.48) Plumas 12.86 (8.94) 11.23 (10.33) Riverside 22.37 (16.1) 9.13 (6.27) Sacramento 15.17 (14.51) 8.25 (7.54) San Benito 4.99 (2.92) 4.71 (2.69) San Bernardino 16.3 (13.08) 9.42 (5.19) San Diego 15.27 (7.65) 8.3 (4.36) San Francisco 15.31 (14.21) 6.78 (5.14) San Joaquin 16.76 (12.26) 7.9 (7.81) San Luis Obispo 8.88 (5.15) 7.1 (4.52) San Mateo 12.24 (8.91) 6.8 (4.94) Santa Barbara 7.04 (3.92) 6.06 (3.51) Santa Clara 14.83 (10.65) 8.77 (5.59) Santa Cruz 8.55 (4.34) 5.23 (3.74) Shasta 7.25 (8.38) 5.04 (4.67) Siskiyou 2.69 (2.06) 7.6 (17.3) Solano 15.48 (14.15) 7.9 (5.46) Sonoma 11.13 (9.5) 6.38 (4.29) Stanislaus 19.66 (17.09) 12.12 (8.42) Sutter 12.82 (10.2) 10.04 (6.6) Tehama NA 5.87 (4.5) Trinity 2.78 (2.4) 10.73 (23.1) Tulare 23.1 (16.15) 10.41 (7.53) Ventura 13.4 (7.24) 6.85 (3.83) Yolo 10.58 (8.86) 5.44 (5.29) df_couty <- df %>% group_by(Year, County) %>% summarise( n = n(), mean_PM2.5 = mean(PM2.5, na.rm = T), sd_PM2.5 = sd(PM2.5, na.rm = T), ) %>% mutate(se_PM2.5 = sd_PM2.5 / sqrt(n))`summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.ggplot(df_couty, aes(y = County, x = mean_PM2.5, xmin = mean_PM2.5 - 1.96 * se_PM2.5, xmax = mean_PM2.5 + 1.96 * se_PM2.5, color = Year, group = Year)) + geom_pointrange(size = 0.3, fatten = 1.2) + labs(title = paste0("Daily PM2.5 Concentration (2002 vs 2022), by County"), x = "Daily PM2.5 Concentration (95% CI)", y = "County",) + scale_color_manual(name = "Year", values = c("2002" = "#F8766D", "2022" = "#00BFC4")) + theme_minimal() + theme(legend.position = "bottom", legend.box = "horizontal", legend.justification = "center")Level 3: Sites in LA County
Summary. The mean of daily PM 2.5 for all cities in Los Angeles county in 2022 are lower than that of 2000
summary5 <- df %>% filter(County == "Los Angeles", Year %in% c("2002", "2022"), Local.Site.Name != "") %>% group_by(Year, Local.Site.Name) %>% summarise( n = n(), mean = round(mean(PM2.5, na.rm = T), 2), sd = round(sd(PM2.5, na.rm = TRUE), 2)) %>% mutate(`Mean (SD)`= paste0(mean, " (", sd, ")")) %>% select(Year, Local.Site.Name, `Mean (SD)`) %>% pivot_wider( names_from = Year, values_from = `Mean (SD)`, names_prefix = "Mean (SD) in ") %>% mutate(`Local Site Name` = Local.Site.Name) %>% select(`Local Site Name`, `Mean (SD) in 2002`, `Mean (SD) in 2022`) %>% arrange(`Local Site Name`)`summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.kable(as.data.frame(summary5), align = "c")Local Site Name Mean (SD) in 2002 Mean (SD) in 2022 Azusa 20.76 (12.11) 9.72 (4.39) Burbank 23.97 (12.74) NA Compton NA 12.99 (6.22) Glendora NA 8.42 (5.47) Lancaster-Division Street 10.38 (4.47) 7.52 (2.48) Lebec 4.82 (2.78) 3.5 (1.65) Long Beach (North) 19.47 (10.39) 9.92 (3.27) Long Beach (South) NA 11.97 (4.32) Long Beach-Route 710 Near Road NA 13.02 (5.61) Los Angeles-North Main Street 21.97 (11.69) 11.58 (4.57) Lynwood 23.35 (11.96) NA North Hollywood (NOHO) NA 13.02 (4.75) Pasadena 20.29 (11.14) 9.09 (3.68) Pico Rivera #2 NA 11.45 (5.93) Reseda 18.85 (10.65) 10.72 (4.56) Santa Clarita NA 9.14 (3.9) Signal Hill (LBSH) NA 8.85 (4.42) ggplot(df %>% filter(County == "Los Angeles", Year %in% c("2002", "2022"), Local.Site.Name != ""), aes(x = PM2.5, y = Local.Site.Name, fill = Year)) + geom_violin(alpha = 0.6, scale = "width", position = position_dodge(width = 0.8), trim = FALSE) + labs( title = "Daily PM2.5 Concentration in LA County", subtitle = "By monitoring sites (2002 vs 2022)", x = "Daily PM2.5 Concentration", y = "Site", fill = "Year") + scale_color_manual(name = "Year", values = c("2002" = "#F8766D", "2022" = "#00BFC4")) + scale_fill_manual(name = "Year", values = c("2002" = "#F8766D", "2022" = "#00BFC4")) + theme_minimal() + theme( legend.position = "bottom", legend.box = "horizontal", legend.justification = "center" )Warning: No shared levels found between `names(values)` of the manual scale and the data's colour values.
Reminder: after you upload your final rendered document to GitHub, you should download it to make sure that it looks right! If you haven’t included embed-resources: true in the YAML header, none of your figures will show up!